import pandas as pd
import numpy as np
df=pd.read_excel('Student Applications & Performance.xlsx')
df.head()
df.shape
(3400, 56)
#FUNCTION USED INORDER TO SEPERATE NUMERICAL AND CATEGORICAL DATA
%run dqr.py
report=dataQuality(df)
type(report)
dict
report.keys()
dict_keys(['numeric', 'categorical'])
report['numeric']
| Count | Unique | Miss_per | Min | Max | Mean | Median | Std | Var | Q1 | Q3 | q99 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STUDENT IDENTIFIER | 3400.0 | 3400.0 | 0.000000 | 7755837.0 | 8037098.0 | 7.897155e+06 | 7899052.5 | 80834.212657 | 6.534170e+09 | 7827039.25 | 7964008.00 | 8034112.56 |
| STDNT_AGE | 3400.0 | 11.0 | 0.000000 | 16.0 | 26.0 | 1.799412e+01 | 18.0 | 0.552052 | 3.047609e-01 | 18.00 | 18.00 | 19.00 |
| STDNT_TEST_ENTRANCE1 | 1106.0 | 24.0 | 0.674706 | 8.0 | 32.0 | 2.066004e+01 | 20.0 | 3.234734 | 1.046351e+01 | 18.00 | 22.00 | 30.00 |
| STDNT_TEST_ENTRANCE2 | 2492.0 | 110.0 | 0.267059 | 350.0 | 1490.0 | 9.650923e+02 | 990.0 | 209.088633 | 4.371806e+04 | 900.00 | 1100.00 | 1350.90 |
| STDNT_TEST_ENTRANCE_COMB | 2882.0 | 25.0 | 0.152353 | 530.0 | 1510.0 | 9.977585e+02 | 990.0 | 156.268221 | 2.441976e+04 | 910.00 | 1110.00 | 1340.00 |
| FIRST_TERM | 3400.0 | 6.0 | 0.000000 | 200508.0 | 201008.0 | 2.007770e+05 | 200808.0 | 170.001298 | 2.890044e+04 | 200608.00 | 200908.00 | 201008.00 |
| SECOND_TERM | 3400.0 | 6.0 | 0.000000 | 200602.0 | 201102.0 | 2.008710e+05 | 200902.0 | 170.001298 | 2.890044e+04 | 200702.00 | 201002.00 | 201102.00 |
| RETURNED_2ND_YR | 3400.0 | 2.0 | 0.000000 | 0.0 | 1.0 | 7.873529e-01 | 1.0 | 0.409240 | 1.674775e-01 | 1.00 | 1.00 | 1.00 |
| DISTANCE_FROM_HOME | 3375.0 | 72.0 | 0.007353 | 0.0 | 5932.0 | 9.982519e+01 | 69.0 | 235.196984 | 5.531762e+04 | 69.00 | 138.00 | 666.00 |
| HIGH_SCHL_GPA | 3347.0 | 201.0 | 0.015588 | 0.0 | 4.0 | 3.202738e+00 | 3.2 | 0.455337 | 2.073318e-01 | 2.87 | 3.56 | 4.00 |
| FATHER_HI_EDU_CD | 2968.0 | 5.0 | 0.127059 | 1.0 | 4.0 | 2.560647e+00 | 3.0 | 0.681273 | 4.641333e-01 | 2.00 | 3.00 | 4.00 |
| MOTHER_HI_EDU_CD | 2911.0 | 5.0 | 0.143824 | 1.0 | 4.0 | 2.571281e+00 | 3.0 | 0.611729 | 3.742128e-01 | 2.00 | 3.00 | 4.00 |
| FIRST_TERM_ATTEMPT_HRS | 3400.0 | 12.0 | 0.000000 | 9.0 | 21.0 | 1.398588e+01 | 14.0 | 1.554286 | 2.415805e+00 | 13.00 | 15.00 | 18.00 |
| FIRST_TERM_EARNED_HRS | 3400.0 | 22.0 | 0.000000 | 0.0 | 21.0 | 1.220500e+01 | 13.0 | 2.986294 | 8.917951e+00 | 11.00 | 15.00 | 17.00 |
| SECOND_TERM_ATTEMPT_HRS | 3194.0 | 22.0 | 0.060588 | 2.0 | 23.0 | 1.428585e+01 | 14.0 | 2.104387 | 4.428444e+00 | 13.00 | 16.00 | 19.00 |
| SECOND_TERM_EARNED_HRS | 3191.0 | 24.0 | 0.061471 | 0.0 | 23.0 | 1.251113e+01 | 13.0 | 3.470492 | 1.204431e+01 | 11.00 | 15.00 | 19.00 |
| GROSS_FIN_NEED | 3400.0 | 927.0 | 0.000000 | 0.0 | 2124900.0 | 3.034025e+05 | 0.0 | 483625.450816 | 2.338936e+11 | 0.00 | 599580.00 | 1418100.00 |
| COST_OF_ATTEND | 3400.0 | 120.0 | 0.000000 | 0.0 | 2124900.0 | 5.515335e+05 | 150540.0 | 606440.325202 | 3.677699e+11 | 0.00 | 1192815.00 | 1489080.00 |
| EST_FAM_CONTRIBUTION | 3400.0 | 1236.0 | 0.000000 | 0.0 | 5999940.0 | 3.446899e+05 | 0.0 | 787187.774837 | 6.196646e+11 | 0.00 | 324180.00 | 3905646.60 |
| UNMET_NEED | 3400.0 | 1197.0 | 0.000000 | -1212072.0 | 1632660.0 | 7.222052e+04 | 0.0 | 279567.921396 | 7.815822e+10 | 0.00 | 187615.50 | 883023.60 |
##SORTING OUT THE EARLY ATTRITION STUDENTS
att=df[df['RETURNED_2ND_YR']==0]
att
| STUDENT IDENTIFIER | STDNT_AGE | STDNT_GENDER | STDNT_BACKGROUND | IN_STATE_FLAG | INTERNATIONAL_STS | STDNT_MAJOR | STDNT_MINOR | STDNT_TEST_ENTRANCE1 | STDNT_TEST_ENTRANCE2 | ... | DEGREE_GROUP_CD | DEGREE_GROUP_DESC | FIRST_TERM_ATTEMPT_HRS | FIRST_TERM_EARNED_HRS | SECOND_TERM_ATTEMPT_HRS | SECOND_TERM_EARNED_HRS | GROSS_FIN_NEED | COST_OF_ATTEND | EST_FAM_CONTRIBUTION | UNMET_NEED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7808615 | 18 | F | BGD 1 | Y | N | Undeclared | N | NaN | 1150.0 | ... | B | Bachelors | 16 | 16 | 14.0 | 14.0 | 0 | 0 | 0 | 0.0 |
| 11 | 7884573 | 18 | F | BGD 1 | Y | N | Undeclared | N | NaN | 1050.0 | ... | B | Bachelors | 16 | 16 | 16.0 | 16.0 | 0 | 1189920 | 5163240 | -326880.0 |
| 17 | 8004193 | 17 | M | BGD 1 | N | N | Biology | N | NaN | 1040.0 | ... | B | Bachelors | 14 | 14 | 14.0 | 14.0 | 1083000 | 1324500 | 241500 | 476580.0 |
| 23 | 8007842 | 18 | F | BGD 1 | Y | N | Pre-Nursing | N | NaN | 580.0 | ... | B | Bachelors | 15 | 15 | 15.0 | 15.0 | 0 | 0 | 0 | 0.0 |
| 36 | 7799972 | 18 | F | BGD 1 | Y | N | Undeclared | N | NaN | 620.0 | ... | B | Bachelors | 18 | 18 | 14.0 | 14.0 | 0 | 0 | 0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3394 | 7960053 | 18 | F | BGD 1 | Y | N | Theatre Arts | N | NaN | 1010.0 | ... | B | Bachelors | 13 | 3 | NaN | NaN | 0 | 0 | 0 | 0.0 |
| 3395 | 7948112 | 18 | F | BGD 4 | N | N | Music Performance | N | NaN | 800.0 | ... | B | Bachelors | 18 | 18 | 16.0 | 13.0 | 0 | 0 | 0 | 0.0 |
| 3396 | 8023055 | 18 | F | BGD 1 | Y | N | Biology | N | 20.0 | NaN | ... | B | Bachelors | 13 | 9 | NaN | NaN | 0 | 0 | 0 | 0.0 |
| 3397 | 7926915 | 18 | F | BGD 4 | Y | N | Biology | N | NaN | 1020.0 | ... | B | Bachelors | 14 | 6 | NaN | NaN | 0 | 945840 | 1288980 | 0.0 |
| 3399 | 7928405 | 18 | M | BGD 2 | Y | N | Art | Creative Writing | NaN | 890.0 | ... | B | Bachelors | 12 | 9 | 13.0 | 6.0 | 0 | 0 | 0 | 0.0 |
723 rows × 56 columns
import seaborn as sb
EXPLORATORY DATA ANALYSIS
df['RETURNED_2ND_YR'].value_counts()
1 2677 0 723 Name: RETURNED_2ND_YR, dtype: int64
### 723 NON RETURNEES (21%) FROM A BATCH OF 3399 STUDENTS
bygen_famcontribution=df.groupby('RETURNED_2ND_YR')['EST_FAM_CONTRIBUTION'].mean().reset_index()
bygen_famcontribution
| RETURNED_2ND_YR | EST_FAM_CONTRIBUTION | |
|---|---|---|
| 0 | 0 | 315265.643154 |
| 1 | 1 | 352636.787449 |
by_unmetneeds=df.groupby('RETURNED_2ND_YR')['UNMET_NEED'].mean().reset_index()
by_unmetneeds
| RETURNED_2ND_YR | UNMET_NEED | |
|---|---|---|
| 0 | 0 | 90679.460581 |
| 1 | 1 | 67235.153455 |
p=sb.barplot(x='RETURNED_2ND_YR',y='UNMET_NEED',data=by_unmetneeds)
for i,v in by_unmetneeds.iterrows():
p.text(v.name,v.UNMET_NEED,round(v.UNMET_NEED,2),ha='center')
by_highsclgpa=df.groupby('RETURNED_2ND_YR')['HIGH_SCHL_GPA'].mean().reset_index()
by_highsclgpa
| RETURNED_2ND_YR | HIGH_SCHL_GPA | |
|---|---|---|
| 0 | 0 | 3.131802 |
| 1 | 1 | 3.221768 |
by_attempthrs=df.groupby('RETURNED_2ND_YR')['FIRST_TERM_EARNED_HRS'].mean().reset_index()
by_attempthrs
| RETURNED_2ND_YR | FIRST_TERM_EARNED_HRS | |
|---|---|---|
| 0 | 0 | 11.809129 |
| 1 | 1 | 12.311916 |
by_DISTANCE_FROM_HOME=df.groupby('RETURNED_2ND_YR')['DISTANCE_FROM_HOME'].mean().reset_index()
by_DISTANCE_FROM_HOME
| RETURNED_2ND_YR | DISTANCE_FROM_HOME | |
|---|---|---|
| 0 | 0 | 112.214586 |
| 1 | 1 | 96.506762 |
h=sb.barplot(x='RETURNED_2ND_YR',y='DISTANCE_FROM_HOME',data=by_DISTANCE_FROM_HOME)
for i,v in by_DISTANCE_FROM_HOME.iterrows():
p.text(v.name,v.DISTANCE_FROM_HOME,round(v.DISTANCE_FROM_HOME,2),ha='center')
by_STDNT_TEST_ENTRANCE_COMB=df.groupby('RETURNED_2ND_YR')['STDNT_TEST_ENTRANCE_COMB'].mean().reset_index()
by_STDNT_TEST_ENTRANCE_COMB
| RETURNED_2ND_YR | STDNT_TEST_ENTRANCE_COMB | |
|---|---|---|
| 0 | 0 | 999.403974 |
| 1 | 1 | 997.322212 |
report['categorical']
| Count | Unique | Miss_per | Freq_Level | freq_cat_Per | |
|---|---|---|---|---|---|
| STDNT_GENDER | 3400 | 2 | 0 | F | 0.629412 |
| STDNT_BACKGROUND | 3400 | 8 | 0 | BGD 1 | 0.612353 |
| IN_STATE_FLAG | 3400 | 2 | 0 | Y | 0.900882 |
| INTERNATIONAL_STS | 3400 | 2 | 0 | N | 0.992059 |
| STDNT_MAJOR | 3400 | 54 | 0 | Undeclared | 0.135294 |
| STDNT_MINOR | 3400 | 37 | 0 | N | 0.929412 |
| CORE_COURSE_NAME_1_F | 3400 | 69 | 0 | ENGL 1101 | 0.183529 |
| CORE_COURSE_GRADE_1_F | 3400 | 7 | 0 | C | 0.301176 |
| CORE_COURSE_NAME_2_F | 3301 | 75 | 0.0291176 | ENGL 1101 | 0.216471 |
| CORE_COURSE_GRADE_2_F | 3301 | 8 | 0.0291176 | B | 0.393824 |
| CORE_COURSE_NAME_3_F | 2835 | 75 | 0.166176 | ENGL 1101 | 0.167059 |
| CORE_COURSE_GRADE_3_F | 2835 | 7 | 0.166176 | B | 0.339706 |
| CORE_COURSE_NAME_4_F | 1803 | 69 | 0.469706 | ENGL 1101 | 0.0914706 |
| CORE_COURSE_GRADE_4_F | 1803 | 7 | 0.469706 | A | 0.321765 |
| CORE_COURSE_NAME_5_F | 645 | 54 | 0.810294 | ENGL 1101 | 0.0238235 |
| CORE_COURSE_GRADE_5_F | 645 | 5 | 0.810294 | A | 0.147647 |
| CORE_COURSE_NAME_6_F | 128 | 25 | 0.962353 | ITDS 2749 | 0.00588235 |
| CORE_COURSE_GRADE_6_F | 128 | 5 | 0.962353 | A | 0.0323529 |
| CORE_COURSE_NAME_1_S | 3243 | 67 | 0.0461765 | ENGL 1102 | 0.184412 |
| CORE_COURSE_GRADE_1_S | 3168 | 9 | 0.0682353 | C | 0.266471 |
| CORE_COURSE_NAME_2_S | 2961 | 70 | 0.129118 | ENGL 1102 | 0.181471 |
| CORE_COURSE_GRADE_2_S | 2961 | 7 | 0.129118 | B | 0.328824 |
| CORE_COURSE_NAME_3_S | 2362 | 69 | 0.305294 | ENGL 1102 | 0.119118 |
| CORE_COURSE_GRADE_3_S | 2362 | 7 | 0.305294 | A | 0.292647 |
| CORE_COURSE_NAME_4_S | 1355 | 65 | 0.601471 | ENGL 1102 | 0.0582353 |
| CORE_COURSE_GRADE_4_S | 1355 | 6 | 0.601471 | A | 0.226471 |
| CORE_COURSE_NAME_5_S | 450 | 52 | 0.867647 | MUSC 1100 | 0.0129412 |
| CORE_COURSE_GRADE_5_S | 450 | 6 | 0.867647 | A | 0.0891176 |
| CORE_COURSE_NAME_6_S | 81 | 27 | 0.976176 | POLS 1101 | 0.00294118 |
| CORE_COURSE_GRADE_6_S | 81 | 4 | 0.976176 | A | 0.0191176 |
| HOUSING_STS | 3400 | 2 | 0 | Off Campus | 0.582647 |
| HIGH_SCHL_NAME | 3399 | 440 | 0.000294118 | SCHOOL 11 | 0.0732353 |
| FATHER_HI_EDU_DESC | 3400 | 5 | 0 | High School | 0.392941 |
| MOTHER_HI_EDU_DESC | 3400 | 5 | 0 | College/Beyond | 0.441176 |
| DEGREE_GROUP_CD | 3400 | 3 | 0 | B | 0.995294 |
| DEGREE_GROUP_DESC | 3400 | 3 | 0 | Bachelors | 0.995294 |
by_STDNT_GENDER=att.groupby('STDNT_GENDER')['RETURNED_2ND_YR']
by_STDNT_GENDER.value_counts()
STDNT_GENDER RETURNED_2ND_YR F 0 475 M 0 248 Name: RETURNED_2ND_YR, dtype: int64
by_IN_STATE_FLAG=att.groupby('IN_STATE_FLAG')['RETURNED_2ND_YR']
by_IN_STATE_FLAG.value_counts()
IN_STATE_FLAG RETURNED_2ND_YR N 0 89 Y 0 634 Name: RETURNED_2ND_YR, dtype: int64
by_HOUSING_STS=att.groupby('HOUSING_STS')['RETURNED_2ND_YR']
by_HOUSING_STS.value_counts()
HOUSING_STS RETURNED_2ND_YR Off Campus 0 389 On Campus 0 334 Name: RETURNED_2ND_YR, dtype: int64
by_STDNT_MAJOR=att.groupby('STDNT_MAJOR')['RETURNED_2ND_YR']
by_STDNT_MAJOR.value_counts()
STDNT_MAJOR RETURNED_2ND_YR Accounting 0 12 Applied Computer Science 0 5 Art 0 18 Art Education 0 2 Biology 0 82 Biology and Secondary Ed 0 3 Chemistry 0 10 Communication 0 14 Computer Science - Games 0 14 Computer Science - Systems 0 9 Criminal Justice 0 29 Early Childhood Education 0 37 Earth and Space Science 0 1 Engineering Studies 0 15 English Language/Literature 0 9 English and Secondary Ed 0 4 Exercise Science 0 19 Finance 0 8 French 0 1 French with Teacher Cert 0 1 General Business 0 12 General Studies/AS 0 2 Health Science 0 6 Health and Physical Education 0 5 History 0 8 History and Secondary Ed 0 7 Information Technology 0 3 Management 0 15 Management Information Systems 0 7 Marketing 0 15 Mathematics 0 6 Mathematics and Secondary Ed 0 1 Middle Grades Education 0 3 Music 0 6 Music Education 0 8 Music Performance 0 12 Nursing 0 37 Political Science 0 14 Pre-Business 0 32 Pre-Engineering/RETP 0 7 Pre-Nursing 0 47 Psychology 0 43 Sociology 0 4 Spanish with Teacher Cert 0 1 Spec Ed: Gen. Curr. - Reading 0 3 Theatre Arts 0 12 Theatre Education 0 3 Undeclared 0 111 Name: RETURNED_2ND_YR, dtype: int64
by_FATHER_HI_EDU_DESC=att.groupby('FATHER_HI_EDU_DESC')['RETURNED_2ND_YR']
by_FATHER_HI_EDU_DESC.value_counts()
FATHER_HI_EDU_DESC RETURNED_2ND_YR College/Beyond 0 257 High School 0 289 Middle School/Junior High 0 26 No Information 0 103 Other/Unknown 0 48 Name: RETURNED_2ND_YR, dtype: int64
by_MOTHER_HI_EDU_DESC=att.groupby('MOTHER_HI_EDU_DESC')['RETURNED_2ND_YR']
by_MOTHER_HI_EDU_DESC.value_counts().plot(kind='barh',color='blue')
<AxesSubplot:ylabel='MOTHER_HI_EDU_DESC,RETURNED_2ND_YR'>
by_MOTHER_HI_EDU_DESC.value_counts()
MOTHER_HI_EDU_DESC RETURNED_2ND_YR College/Beyond 0 301 High School 0 265 Middle School/Junior High 0 16 No Information 0 123 Other/Unknown 0 18 Name: RETURNED_2ND_YR, dtype: int64
by_STDNT_BACKGROUND=att.groupby('STDNT_BACKGROUND')['RETURNED_2ND_YR']
by_STDNT_BACKGROUND.value_counts().plot(kind='barh',color='purple')
<AxesSubplot:ylabel='STDNT_BACKGROUND,RETURNED_2ND_YR'>
by_STDNT_BACKGROUND.value_counts()
STDNT_BACKGROUND RETURNED_2ND_YR BGD 1 0 477 BGD 2 0 24 BGD 3 0 170 BGD 4 0 26 BGD 5 0 4 BGD 6 0 14 BGD 7 0 7 BGD 8 0 1 Name: RETURNED_2ND_YR, dtype: int64
df.columns
Index(['STUDENT IDENTIFIER', 'STDNT_AGE', 'STDNT_GENDER', 'STDNT_BACKGROUND',
'IN_STATE_FLAG', 'INTERNATIONAL_STS', 'STDNT_MAJOR', 'STDNT_MINOR',
'STDNT_TEST_ENTRANCE1', 'STDNT_TEST_ENTRANCE2',
'STDNT_TEST_ENTRANCE_COMB', 'FIRST_TERM', 'CORE_COURSE_NAME_1_F',
'CORE_COURSE_GRADE_1_F', 'CORE_COURSE_NAME_2_F',
'CORE_COURSE_GRADE_2_F', 'CORE_COURSE_NAME_3_F',
'CORE_COURSE_GRADE_3_F', 'CORE_COURSE_NAME_4_F',
'CORE_COURSE_GRADE_4_F', 'CORE_COURSE_NAME_5_F',
'CORE_COURSE_GRADE_5_F', 'CORE_COURSE_NAME_6_F',
'CORE_COURSE_GRADE_6_F', 'SECOND_TERM', 'CORE_COURSE_NAME_1_S',
'CORE_COURSE_GRADE_1_S', 'CORE_COURSE_NAME_2_S',
'CORE_COURSE_GRADE_2_S', 'CORE_COURSE_NAME_3_S',
'CORE_COURSE_GRADE_3_S', 'CORE_COURSE_NAME_4_S',
'CORE_COURSE_GRADE_4_S', 'CORE_COURSE_NAME_5_S',
'CORE_COURSE_GRADE_5_S', 'CORE_COURSE_NAME_6_S',
'CORE_COURSE_GRADE_6_S', 'HOUSING_STS', 'RETURNED_2ND_YR',
'DISTANCE_FROM_HOME', 'HIGH_SCHL_GPA', 'HIGH_SCHL_NAME',
'FATHER_HI_EDU_CD', 'FATHER_HI_EDU_DESC', 'MOTHER_HI_EDU_CD',
'MOTHER_HI_EDU_DESC', 'DEGREE_GROUP_CD', 'DEGREE_GROUP_DESC',
'FIRST_TERM_ATTEMPT_HRS', 'FIRST_TERM_EARNED_HRS',
'SECOND_TERM_ATTEMPT_HRS', 'SECOND_TERM_EARNED_HRS', 'GROSS_FIN_NEED',
'COST_OF_ATTEND', 'EST_FAM_CONTRIBUTION', 'UNMET_NEED'],
dtype='object')
## PRUNING THE DATA BY REMOVING THE VARIABLES WHICH HAVE LESS EFFECT ON ATTRITION RATE
dm=df.drop(['STDNT_AGE','INTERNATIONAL_STS','STDNT_MINOR','STDNT_TEST_ENTRANCE1', 'STDNT_TEST_ENTRANCE2','FIRST_TERM', 'CORE_COURSE_NAME_1_F',
'CORE_COURSE_GRADE_1_F', 'CORE_COURSE_NAME_2_F',
'CORE_COURSE_GRADE_2_F', 'CORE_COURSE_NAME_3_F',
'CORE_COURSE_GRADE_3_F', 'CORE_COURSE_NAME_4_F',
'CORE_COURSE_GRADE_4_F', 'CORE_COURSE_NAME_5_F',
'CORE_COURSE_GRADE_5_F', 'CORE_COURSE_NAME_6_F',
'CORE_COURSE_GRADE_6_F', 'SECOND_TERM', 'CORE_COURSE_NAME_1_S',
'CORE_COURSE_GRADE_1_S', 'CORE_COURSE_NAME_2_S',
'CORE_COURSE_GRADE_2_S', 'CORE_COURSE_NAME_3_S',
'CORE_COURSE_GRADE_3_S', 'CORE_COURSE_NAME_4_S',
'CORE_COURSE_GRADE_4_S', 'CORE_COURSE_NAME_5_S',
'CORE_COURSE_GRADE_5_S', 'CORE_COURSE_NAME_6_S',
'CORE_COURSE_GRADE_6_S','HIGH_SCHL_NAME',
'FATHER_HI_EDU_CD','MOTHER_HI_EDU_CD', 'DEGREE_GROUP_CD', 'DEGREE_GROUP_DESC',
'FIRST_TERM_ATTEMPT_HRS','FIRST_TERM_EARNED_HRS',
'SECOND_TERM_ATTEMPT_HRS', 'SECOND_TERM_EARNED_HRS', 'GROSS_FIN_NEED','COST_OF_ATTEND'],axis=1)
BUILDING THE MODEL USING RANDOM FOREST
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn import tree
df.isnull().sum()
STUDENT IDENTIFIER 0 STDNT_AGE 0 STDNT_GENDER 0 STDNT_BACKGROUND 0 IN_STATE_FLAG 0 INTERNATIONAL_STS 0 STDNT_MAJOR 0 STDNT_MINOR 0 STDNT_TEST_ENTRANCE1 2294 STDNT_TEST_ENTRANCE2 908 STDNT_TEST_ENTRANCE_COMB 518 FIRST_TERM 0 CORE_COURSE_NAME_1_F 0 CORE_COURSE_GRADE_1_F 0 CORE_COURSE_NAME_2_F 99 CORE_COURSE_GRADE_2_F 99 CORE_COURSE_NAME_3_F 565 CORE_COURSE_GRADE_3_F 565 CORE_COURSE_NAME_4_F 1597 CORE_COURSE_GRADE_4_F 1597 CORE_COURSE_NAME_5_F 2755 CORE_COURSE_GRADE_5_F 2755 CORE_COURSE_NAME_6_F 3272 CORE_COURSE_GRADE_6_F 3272 SECOND_TERM 0 CORE_COURSE_NAME_1_S 157 CORE_COURSE_GRADE_1_S 232 CORE_COURSE_NAME_2_S 439 CORE_COURSE_GRADE_2_S 439 CORE_COURSE_NAME_3_S 1038 CORE_COURSE_GRADE_3_S 1038 CORE_COURSE_NAME_4_S 2045 CORE_COURSE_GRADE_4_S 2045 CORE_COURSE_NAME_5_S 2950 CORE_COURSE_GRADE_5_S 2950 CORE_COURSE_NAME_6_S 3319 CORE_COURSE_GRADE_6_S 3319 HOUSING_STS 0 RETURNED_2ND_YR 0 DISTANCE_FROM_HOME 25 HIGH_SCHL_GPA 53 HIGH_SCHL_NAME 1 FATHER_HI_EDU_CD 432 FATHER_HI_EDU_DESC 0 MOTHER_HI_EDU_CD 489 MOTHER_HI_EDU_DESC 0 DEGREE_GROUP_CD 0 DEGREE_GROUP_DESC 0 FIRST_TERM_ATTEMPT_HRS 0 FIRST_TERM_EARNED_HRS 0 SECOND_TERM_ATTEMPT_HRS 206 SECOND_TERM_EARNED_HRS 209 GROSS_FIN_NEED 0 COST_OF_ATTEND 0 EST_FAM_CONTRIBUTION 0 UNMET_NEED 0 dtype: int64
len(dm.columns)
14
dm.isnull().sum()
STUDENT IDENTIFIER 0 STDNT_GENDER 0 STDNT_BACKGROUND 0 IN_STATE_FLAG 0 STDNT_MAJOR 0 STDNT_TEST_ENTRANCE_COMB 518 HOUSING_STS 0 RETURNED_2ND_YR 0 DISTANCE_FROM_HOME 25 HIGH_SCHL_GPA 53 FATHER_HI_EDU_DESC 0 MOTHER_HI_EDU_DESC 0 EST_FAM_CONTRIBUTION 0 UNMET_NEED 0 dtype: int64
dm=dm.dropna()
dm=pd.get_dummies(dm)
dm.columns
Index(['STUDENT IDENTIFIER', 'STDNT_TEST_ENTRANCE_COMB', 'RETURNED_2ND_YR',
'DISTANCE_FROM_HOME', 'HIGH_SCHL_GPA', 'EST_FAM_CONTRIBUTION',
'UNMET_NEED', 'STDNT_GENDER_F', 'STDNT_GENDER_M',
'STDNT_BACKGROUND_BGD 1', 'STDNT_BACKGROUND_BGD 2',
'STDNT_BACKGROUND_BGD 3', 'STDNT_BACKGROUND_BGD 4',
'STDNT_BACKGROUND_BGD 5', 'STDNT_BACKGROUND_BGD 6',
'STDNT_BACKGROUND_BGD 7', 'STDNT_BACKGROUND_BGD 8', 'IN_STATE_FLAG_N',
'IN_STATE_FLAG_Y', 'STDNT_MAJOR_Accounting',
'STDNT_MAJOR_Applied Computer Science', 'STDNT_MAJOR_Art',
'STDNT_MAJOR_Art Education', 'STDNT_MAJOR_Biology',
'STDNT_MAJOR_Biology and Secondary Ed', 'STDNT_MAJOR_Chemistry',
'STDNT_MAJOR_Chemistry and Secondary Ed', 'STDNT_MAJOR_Communication',
'STDNT_MAJOR_Computer Science - Games',
'STDNT_MAJOR_Computer Science - Systems',
'STDNT_MAJOR_Criminal Justice', 'STDNT_MAJOR_Early Admission - Accel',
'STDNT_MAJOR_Early Childhood Education',
'STDNT_MAJOR_Earth and Space Science',
'STDNT_MAJOR_Engineering Studies',
'STDNT_MAJOR_English Language/Literature',
'STDNT_MAJOR_English and Secondary Ed', 'STDNT_MAJOR_Exercise Science',
'STDNT_MAJOR_Finance', 'STDNT_MAJOR_French',
'STDNT_MAJOR_French with Teacher Cert', 'STDNT_MAJOR_General Business',
'STDNT_MAJOR_General Studies/AS', 'STDNT_MAJOR_Geology',
'STDNT_MAJOR_Health Science',
'STDNT_MAJOR_Health and Physical Education', 'STDNT_MAJOR_History',
'STDNT_MAJOR_History and Secondary Ed',
'STDNT_MAJOR_Information Technology',
'STDNT_MAJOR_Joint Enrollment - Accel', 'STDNT_MAJOR_Liberal Arts',
'STDNT_MAJOR_Management', 'STDNT_MAJOR_Management Information Systems',
'STDNT_MAJOR_Marketing', 'STDNT_MAJOR_Mathematics',
'STDNT_MAJOR_Mathematics and Secondary Ed',
'STDNT_MAJOR_Middle Grades Education', 'STDNT_MAJOR_Music',
'STDNT_MAJOR_Music Education', 'STDNT_MAJOR_Music Performance',
'STDNT_MAJOR_Nursing', 'STDNT_MAJOR_Political Science',
'STDNT_MAJOR_Pre-Business', 'STDNT_MAJOR_Pre-Engineering/RETP',
'STDNT_MAJOR_Pre-Nursing', 'STDNT_MAJOR_Psychology',
'STDNT_MAJOR_Sociology', 'STDNT_MAJOR_Spanish',
'STDNT_MAJOR_Spanish with Teacher Cert',
'STDNT_MAJOR_Spec Ed: Gen. Curr. - Reading', 'STDNT_MAJOR_Theatre Arts',
'STDNT_MAJOR_Theatre Education', 'STDNT_MAJOR_Undeclared',
'HOUSING_STS_Off Campus', 'HOUSING_STS_On Campus',
'FATHER_HI_EDU_DESC_College/Beyond', 'FATHER_HI_EDU_DESC_High School',
'FATHER_HI_EDU_DESC_Middle School/Junior High',
'FATHER_HI_EDU_DESC_No Information', 'FATHER_HI_EDU_DESC_Other/Unknown',
'MOTHER_HI_EDU_DESC_College/Beyond', 'MOTHER_HI_EDU_DESC_High School',
'MOTHER_HI_EDU_DESC_Middle School/Junior High',
'MOTHER_HI_EDU_DESC_No Information',
'MOTHER_HI_EDU_DESC_Other/Unknown'],
dtype='object')
dm=dm.drop(columns=['STDNT_MAJOR_Accounting', 'STDNT_MAJOR_Applied Computer Science',
'STDNT_MAJOR_Art', 'STDNT_MAJOR_Art Education',
'STDNT_MAJOR_Biology and Secondary Ed', 'STDNT_MAJOR_Chemistry',
'STDNT_MAJOR_Chemistry and Secondary Ed', 'STDNT_MAJOR_Communication',
'STDNT_MAJOR_Computer Science - Games',
'STDNT_MAJOR_Computer Science - Systems', 'STDNT_MAJOR_Early Admission - Accel',
'STDNT_MAJOR_Earth and Space Science',
'STDNT_MAJOR_Engineering Studies',
'STDNT_MAJOR_English Language/Literature',
'STDNT_MAJOR_English and Secondary Ed', 'STDNT_MAJOR_Exercise Science', 'STDNT_MAJOR_French',
'STDNT_MAJOR_French with Teacher Cert', 'STDNT_MAJOR_General Business',
'STDNT_MAJOR_General Studies/AS', 'STDNT_MAJOR_Geology',
'STDNT_MAJOR_Health Science',
'STDNT_MAJOR_Health and Physical Education', 'STDNT_MAJOR_History',
'STDNT_MAJOR_History and Secondary Ed',
'STDNT_MAJOR_Information Technology',
'STDNT_MAJOR_Joint Enrollment - Accel', 'STDNT_MAJOR_Liberal Arts',
'STDNT_MAJOR_Management', 'STDNT_MAJOR_Management Information Systems',
'STDNT_MAJOR_Marketing', 'STDNT_MAJOR_Mathematics',
'STDNT_MAJOR_Mathematics and Secondary Ed',
'STDNT_MAJOR_Middle Grades Education', 'STDNT_MAJOR_Music',
'STDNT_MAJOR_Music Education', 'STDNT_MAJOR_Music Performance', 'STDNT_MAJOR_Political Science',
'STDNT_MAJOR_Pre-Engineering/RETP',
'STDNT_MAJOR_Sociology', 'STDNT_MAJOR_Spanish',
'STDNT_MAJOR_Spanish with Teacher Cert',
'STDNT_MAJOR_Spec Ed: Gen. Curr. - Reading', 'STDNT_MAJOR_Theatre Arts',
'STDNT_MAJOR_Theatre Education','STDNT_MAJOR_Undeclared'])
len(dm.columns)
39
dm=dm.drop('STUDENT IDENTIFIER',axis=1)
def clean_dataset(dm):
assert isinstance(dm, pd.DataFrame)
dm.dropna(inplace=True)
indices_to_keep = ~dm.isin([np.nan, np.inf, -np.inf]).any(1)
return df[indices_to_keep].astype(np.float64)
x=dm.drop(columns='RETURNED_2ND_YR')
x
| STDNT_TEST_ENTRANCE_COMB | DISTANCE_FROM_HOME | HIGH_SCHL_GPA | EST_FAM_CONTRIBUTION | UNMET_NEED | STDNT_GENDER_F | STDNT_GENDER_M | STDNT_BACKGROUND_BGD 1 | STDNT_BACKGROUND_BGD 2 | STDNT_BACKGROUND_BGD 3 | ... | FATHER_HI_EDU_DESC_College/Beyond | FATHER_HI_EDU_DESC_High School | FATHER_HI_EDU_DESC_Middle School/Junior High | FATHER_HI_EDU_DESC_No Information | FATHER_HI_EDU_DESC_Other/Unknown | MOTHER_HI_EDU_DESC_College/Beyond | MOTHER_HI_EDU_DESC_High School | MOTHER_HI_EDU_DESC_Middle School/Junior High | MOTHER_HI_EDU_DESC_No Information | MOTHER_HI_EDU_DESC_Other/Unknown | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1150.0 | 150.0 | 4.00 | 0 | 0.0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 1190.0 | 69.0 | 2.89 | 785760 | 459300.0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 1030.0 | 150.0 | 3.39 | 0 | 0.0 | 0 | 1 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 1220.0 | 150.0 | 2.93 | 0 | 0.0 | 0 | 1 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 1190.0 | 69.0 | 3.86 | 519840 | 278340.0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3395 | 790.0 | 871.0 | 3.93 | 0 | 0.0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3396 | 950.0 | 69.0 | 2.69 | 0 | 0.0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3397 | 1030.0 | 90.0 | 2.76 | 1288980 | 0.0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 3398 | 990.0 | 0.0 | 3.56 | 0 | -231720.0 | 1 | 0 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 3399 | 870.0 | 69.0 | 2.63 | 0 | 0.0 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
2832 rows × 37 columns
y=dm['RETURNED_2ND_YR']
x.shape
(2832, 37)
y.shape
(2832,)
X_train,X_test,Y_train,Y_test=train_test_split(x,y,random_state=0)
dm.columns
Index(['STDNT_TEST_ENTRANCE_COMB', 'RETURNED_2ND_YR', 'DISTANCE_FROM_HOME',
'HIGH_SCHL_GPA', 'EST_FAM_CONTRIBUTION', 'UNMET_NEED', 'STDNT_GENDER_F',
'STDNT_GENDER_M', 'STDNT_BACKGROUND_BGD 1', 'STDNT_BACKGROUND_BGD 2',
'STDNT_BACKGROUND_BGD 3', 'STDNT_BACKGROUND_BGD 4',
'STDNT_BACKGROUND_BGD 5', 'STDNT_BACKGROUND_BGD 6',
'STDNT_BACKGROUND_BGD 7', 'STDNT_BACKGROUND_BGD 8', 'IN_STATE_FLAG_N',
'IN_STATE_FLAG_Y', 'STDNT_MAJOR_Biology',
'STDNT_MAJOR_Criminal Justice', 'STDNT_MAJOR_Early Childhood Education',
'STDNT_MAJOR_Finance', 'STDNT_MAJOR_Nursing',
'STDNT_MAJOR_Pre-Business', 'STDNT_MAJOR_Pre-Nursing',
'STDNT_MAJOR_Psychology', 'HOUSING_STS_Off Campus',
'HOUSING_STS_On Campus', 'FATHER_HI_EDU_DESC_College/Beyond',
'FATHER_HI_EDU_DESC_High School',
'FATHER_HI_EDU_DESC_Middle School/Junior High',
'FATHER_HI_EDU_DESC_No Information', 'FATHER_HI_EDU_DESC_Other/Unknown',
'MOTHER_HI_EDU_DESC_College/Beyond', 'MOTHER_HI_EDU_DESC_High School',
'MOTHER_HI_EDU_DESC_Middle School/Junior High',
'MOTHER_HI_EDU_DESC_No Information',
'MOTHER_HI_EDU_DESC_Other/Unknown'],
dtype='object')
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier(max_depth=7)
classifier.fit(x,y)
RandomForestClassifier(max_depth=7)
y_pred=classifier.predict(X_test)
print(confusion_matrix(Y_test,y_pred))
print(accuracy_score(Y_test,y_pred))
print(classification_report(Y_test,y_pred))
[[ 3 148]
[ 0 557]]
0.7909604519774012
precision recall f1-score support
0 1.00 0.02 0.04 151
1 0.79 1.00 0.88 557
accuracy 0.79 708
macro avg 0.90 0.51 0.46 708
weighted avg 0.83 0.79 0.70 708
# As you can see the tree is highly biased to the majority value as the data is class imbalanced
#so i am using UNDER SAMPLING inorder to tackle the situation
y.value_counts()
1 2243 0 589 Name: RETURNED_2ND_YR, dtype: int64
count_class_1,count_class_0=dm.RETURNED_2ND_YR.value_counts()
count_class_0,count_class_1
(589, 2243)
dm_class_0=dm[dm['RETURNED_2ND_YR']==0]
dm_class_1=dm[dm['RETURNED_2ND_YR']==1]
dm_class_0.shape
(589, 38)
dm_class_1.shape
(2243, 38)
dm_class_1_under=dm_class_1.sample(count_class_0)
dm_test_under=pd.concat([dm_class_1_under,dm_class_0],axis=0)
dm_test_under.shape
(1178, 38)
x=dm_test_under.drop('RETURNED_2ND_YR',axis=1)
y=dm_test_under['RETURNED_2ND_YR']
X_train,X_test,Y_train,Y_test=train_test_split(x,y,random_state=0,stratify=y)
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier(n_estimators=100,max_depth=4)
classifier.fit(x,y)
RandomForestClassifier(max_depth=4)
y_pred=classifier.predict(X_test)
print(confusion_matrix(Y_test,y_pred))
print(accuracy_score(Y_test,y_pred))
print(classification_report(Y_test,y_pred))
[[86 62]
[59 88]]
0.5898305084745763
precision recall f1-score support
0 0.59 0.58 0.59 148
1 0.59 0.60 0.59 147
accuracy 0.59 295
macro avg 0.59 0.59 0.59 295
weighted avg 0.59 0.59 0.59 295
classifier=RandomForestClassifier(n_estimators=100,max_depth=7)
classifier.fit(x,y)
RandomForestClassifier(max_depth=7)
y_pred=classifier.predict(X_test)
print(confusion_matrix(Y_test,y_pred))
print(accuracy_score(Y_test,y_pred))
print(classification_report(Y_test,y_pred))
[[107 41]
[ 22 125]]
0.7864406779661017
precision recall f1-score support
0 0.83 0.72 0.77 148
1 0.75 0.85 0.80 147
accuracy 0.79 295
macro avg 0.79 0.79 0.79 295
weighted avg 0.79 0.79 0.79 295
plt.figure(figsize=(85,15)) tree.plottree(classifier.estimators[1],filled=True)
#THIS MODEL HAS A F1 SCORE OF 79% WHICH MAKES THIS AS THE BEST ONE
USING DECISION TREE CLASSIFIER
clf=DecisionTreeClassifier(random_state=0,max_depth=7)
clf=clf.fit(X_train,Y_train)
plot_confusion_matrix(clf,X_test,Y_test,display_labels=['HAVE NOT RETURNED','RETURNED'])
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x14538368dc0>
clf.score(X_test,Y_test)
0.4576271186440678
# SO RANDOM FOREST IS THE MOST SUITABLE ONE
from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(classifier.estimators_[1], out_file='tree.dot',
feature_names = x.columns,
class_names =['HAVE NOT RETURNED','RETURNED'] ,
rounded = True, proportion = False,
filled = True)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')